* code for cleaning up tracks data

* Initialise the main-sample flag: every track starts in the sample (1);
* the cleaning steps further down switch it to 0 for tracks that fail a check.
generate main_sample = 1
*distinct track_id if main_sample
*scalar cnt_start=r(ndistinct)

// Flag months with abnormally low counts (known AIS issues - in NAIS data, usually missing days).
// Documented here: https://www.fisheries.noaa.gov/inport/item/55360
//		December 2014 is reported as having low counts, but doesn't in practice,
//		so it is not part of the flag.
// Flagged months: November 2014 and June 2009.
// Tracks are not excluded from the main data set here; the flag is for use as a robustness check.
// NOTE(review): mofd() expects date to be a Stata daily (%td) date - confirm.
generate bad_months = inlist(mofd(date), ym(2014, 11), ym(2009, 6))
* replace main_sample=0 if bad_months
* distinct track_id if main_sample
* scalar cnt_badMonths=r(ndistinct)

// Getting rid of very long tracks.
// This gets rid of the tracks that are connecting an entrance and exit well outside the ECA
// (dist_outlier / dist_outlier_ec == 1), and also drops tracks that were longer than
// 10 days, which are likely connecting two journeys.
//
// Stata treats missing values as larger than any number, so (hours/24)>10 on its own
// evaluates to true when hours is missing. The !missing(hours) guard keeps a track
// with an unknown duration from being labelled a duration outlier and silently dropped.
gen dist_time_outlier = (((hours/24)>10) & !missing(hours)) | (dist_outlier==1)
gen dist_time_outlier_ec = (((hours/24)>10) & !missing(hours)) | (dist_outlier_ec==1)
// only the non-"_ec" flag feeds main_sample; dist_time_outlier_ec is used by main_sample_ec
replace main_sample=0 if dist_time_outlier==1
*replace main_sample=0 if incVessel==0
*distinct track_id if main_sample
*scalar cnt_noToolong=r(ndistinct)

* Exclude tracks without vessel characteristics: merged_chars==0 marks them.
* NOTE(review): a missing merged_chars does not equal 0, so such tracks stay in
* main_sample at this step - confirm that is intended.
replace main_sample = 0 if merged_chars == 0
*distinct track_id if main_sample
*scalar cnt_noChars_miss=r(ndistinct)

* has a max observed speed on route that is too fast (kmh_max above 60;
* presumably km/h, going by the variable name)
* Stata treats missing as larger than any number, so kmh_max>60 alone is true
* when kmh_max is missing. The !missing() guard restricts the flag to tracks
* whose observed max speed really exceeds the threshold.
gen speed_outlier = (kmh_max>60) & !missing(kmh_max)
replace main_sample=0 if speed_outlier==1
*distinct track_id if main_sample
*scalar cnt_badSpeed=r(ndistinct)

* main_sample_ec is 1 only for tracks that pass every check below, and 0 otherwise
* (a missing value in any of the four inputs also yields 0):
*   - not in a flagged low-count month     (bad_months == 0)
*   - max speed not flagged                (speed_outlier == 0)
*   - not a distance/duration outlier, _ec (dist_time_outlier_ec == 0)
*   - vessel characteristics were merged   (merged_chars == 1)
* NOTE(review): unlike main_sample, this flag excludes bad_months, although the
* bad-months comment says those months are only flagged for robustness checks -
* confirm this difference is intended.
generate main_sample_ec = !(            ///
      (bad_months != 0)                 ///
    | (speed_outlier != 0)              ///
    | (dist_time_outlier_ec != 0)       ///
    | (merged_chars != 1) )

// This describes the number of tracks dropped for various reasons
*display "Total: " cnt_start
*display "Dropped, vessel type: " cnt_start-cnt_noVessel
*display "Total, right vessel type: " cnt_noVessel

*display "Dropped, not major port: " cnt_noVessel-cnt_noMajor
*display "Total, major ports: " cnt_noMajor

*display "Dropped, not classified: " cnt_noMajor-cnt_noclass
*display "Total, major ports and classified: " cnt_noclass

*display "Dropped, same port: " cnt_noclass-cnt_noSameport
*display "Total after basic cleaning: " cnt_noSameport

*display "Dropped, bad months: " cnt_start-cnt_badMonths
*display "Dropped, too long: " cnt_badMonths-cnt_noToolong
*display "Dropped, missing characteristics: " cnt_noToolong-cnt_noChars_miss
*display "Dropped, bad max speed: " cnt_noChars_miss-cnt_badSpeed
*display "Total after final cleaning: " cnt_badSpeed



* other conditions for dropping
*drop if portslow==1 // dropping if
*drop if dist_outlier==0

/*

* dropping tracks with really short lengths (<10km)
* SD to south exit is 14km, so these are segments that don't connect to anything.
* they won't actually matter except for in summary stats
drop if dist<10

tab vesseltype if main_sample==1
// drop all vessels that are not cargo ships or tankers	
replace main_sample=0 if incVessel==0
distinct track_id if main_sample
scalar cnt_noVessel=r(ndistinct)

// drop all tracks that do not move to/from a MAJOR west coast port
replace main_sample=0 if  MAJORport==0	| hitsMAJORport	== 0
distinct track_id if main_sample
scalar cnt_noMajor=r(ndistinct)

tab route_id main_sample , miss 
tab port1 port2 if main_sample, miss

// drop all tracks that we can't classify to a route
replace main_sample=0 if  classified==0   	
distinct track_id if main_sample
scalar cnt_noclass=r(ndistinct)

replace main_sample=0 if samePort==1 // getting rid of trips to/from same port 
				    // some of these are anomalies (ships exiting zone then returning on same track much later)
					// and not reflective of our analysis
distinct track_id if main_sample
scalar cnt_noSameport=r(ndistinct)

*/
